start_time <- Sys.time()
# Load helper packages
library(data.table)
library(plotly)
library(tidymodels)
library(hydrorecipes)
library(dlnm)
library(knitr)
# new names for predictors
nms_other <- c('datetime',
'precipitation',
'temperature_mean',
'temperature_min',
'temperature_max',
'sea_pressure',
'humidity',
'wind',
'insolation',
'evapotranspiration')Prediction Challenge - Germany
Lagged linear regression model
This model can be interpreted and refined based on the responses of each regressor group. It will not be the most accurate predictor, but it is fast and can help identify the most important components.
Set-up
Prepare data
outcome <- fread('data/Germany/heads.csv')
predictors <- fread('data/Germany/input_data.csv')
# make names more verbose
setnames(outcome, c('datetime', 'wl'))
setnames(predictors, nms_other)
# join data and make a numeric time column
dat <- outcome[predictors, on = 'datetime']
# join data and make a numeric time column
dat <- outcome[predictors, on = 'datetime']
# ad hoc estimate of water deficit. Use distributed lag model for predictors
dat[, deficit := cumsum(scale(precipitation)) - cumsum(scale(evapotranspiration))]
dat[, deficit := lm(deficit~splines::ns(datetime, df = 18))$residuals]
varknots <- dlnm::equalknots(dat$deficit, fun = "bs", degree = 3, df = 5)
lagknots <- logknots(365 * 3, 17)
cb <- crossbasis(dat$deficit, lag = 365 * 3,
argvar = list(fun = "bs", knots = varknots),
arglag = list(knots = lagknots))
dat <- cbind(dat, cb)
# date predictors
dat[, dow := lubridate::wday(datetime, label = FALSE)]
dat[, doy := lubridate::yday(datetime)]
dat[, mon := lubridate::month(datetime)]
# scaled precipitation based on ET
dat[, precip_evapo := precipitation * 1.0 / evapotranspiration]
# ad hoc snow melt
dat[, min_temp_diff := c(0, diff(temperature_min, lag = 1))]
dat[, snow_melt := 0]
dat[min_temp_diff >= 7 & month(datetime) %in% 1:3, snow_melt := 1]
dat[, snow_melt := pmin(snow_melt * precipitation, 1)]
# create feature dataset
all <- recipe(wl~., dat) |>
step_interact(terms = ~ precipitation:evapotranspiration) |>
step_interact(terms = ~ precipitation:temperature_mean) |>
step_distributed_lag(sea_pressure, knots = log_lags(5, 15)) |>
step_distributed_lag(precipitation, knots = log_lags(30, 180)) |>
step_distributed_lag(precip_evapo, knots = log_lags(21, 120)) |>
step_distributed_lag(snow_melt, knots = log_lags(35, 90)) |>
step_ns(humidity, deg_free = 20) |>
step_ns(wind, deg_free = 8) |>
step_ns(doy, deg_free = 13) |>
step_ns(mon, deg_free = 6) |>
step_ns(dow, deg_free = 4) |>
step_rm(datetime) |>
step_corr(all_predictors()) |>
prep() |>
bake(new_data = NULL)
setDT(all)Fit model
fit <- lm(wl~., all)Make predictions
dat <- cbind(dat, predict(fit, all, interval = "prediction"))Plot results
A median filter to smooth the results as they appeared to have too much variance.
p1 <- plot_ly(dat, x = ~datetime,
y = ~wl,
type = "scatter",
mode = "lines",
name = "Water Level",
line = list(color = "#808080")) |>
add_lines(x = ~datetime, y = ~runmed(fit, 7), name = "Predictions" ,
line = list(color = "#6000FF60"))
p2 <- plot_ly(dat, x = ~datetime,
y = ~wl - fit,
type = "scatter",
mode = "lines",
name = "Residuals",
line = list(color = "#808080"))
subplot(p1, p2, shareX = TRUE, nrows = 2)Output submission
submission_times <- fread("submissions/team_example/submission_form_Germany_lm.csv")
submission <- dat[datetime %in% submission_times$Date]
submission <- submission[, list(Date = datetime,
`Simulated Head` = fit,
`95% Lower Bound` = lwr,
`95% Upper Bound` = upr)]
fwrite(submission, "submission_form_Germany_lm.csv")
end_time <- Sys.time()Timings
Total elapsed time is 5.7 seconds.
Computer and software specs
Macbook Air M1 2020
16 GB Ram
sessionInfo()R version 4.2.1 (2022-06-23)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.1
Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] knitr_1.41 dlnm_2.4.7 hydrorecipes_0.0.5
[4] yardstick_1.1.0 workflowsets_1.0.0 workflows_1.1.2
[7] tune_1.0.1 tidyr_1.2.1 tibble_3.1.8
[10] rsample_1.1.1 recipes_1.0.3 purrr_1.0.0
[13] parsnip_1.0.3 modeldata_1.0.1 infer_1.0.4
[16] dplyr_1.0.10 dials_1.1.0 scales_1.2.1
[19] broom_1.0.2 tidymodels_1.0.0.9000 plotly_4.10.1
[22] ggplot2_3.4.0 data.table_1.14.6
loaded via a namespace (and not attached):
[1] nlme_3.1-161 lubridate_1.9.0 DiceDesign_1.9
[4] httr_1.4.4 tools_4.2.1 backports_1.4.1
[7] utf8_1.2.2 R6_2.5.1 rpart_4.1.19
[10] mgcv_1.8-41 DBI_1.1.3 lazyeval_0.2.2
[13] colorspace_2.0-3 nnet_7.3-18 withr_2.5.0
[16] tidyselect_1.2.0 compiler_4.2.1 cli_3.5.0
[19] stringr_1.5.0 digest_0.6.31 rmarkdown_2.19
[22] pkgconfig_2.0.3 htmltools_0.5.4 parallelly_1.33.0
[25] lhs_1.1.6 fastmap_1.1.0 collapse_1.8.9
[28] htmlwidgets_1.6.0 rlang_1.0.6 rstudioapi_0.14
[31] generics_0.1.3 jsonlite_1.8.4 crosstalk_1.2.0
[34] magrittr_2.0.3 Matrix_1.5-3 Rcpp_1.0.9
[37] munsell_0.5.0 fansi_1.0.3 GPfit_1.0-8
[40] tsModel_0.6-1 lifecycle_1.0.3 furrr_0.3.1
[43] stringi_1.7.8 yaml_2.3.6 MASS_7.3-58.1
[46] grid_4.2.1 parallel_4.2.1 listenv_0.9.0
[49] lattice_0.20-45 earthtide_0.0.14 splines_4.2.1
[52] pillar_1.8.1 fftw_1.0-7 future.apply_1.10.0
[55] codetools_0.2-18 glue_1.6.2 evaluate_0.19
[58] RcppParallel_5.1.5 vctrs_0.5.1 foreach_1.5.2
[61] gtable_0.3.1 future_1.30.0 assertthat_0.2.1
[64] xfun_0.36 gower_1.0.1 prodlim_2019.11.13
[67] class_7.3-20 survival_3.4-0 viridisLite_0.4.1
[70] timeDate_4021.107 iterators_1.0.14 hardhat_1.2.0
[73] lava_1.7.0 timechange_0.1.1 globals_0.16.2
[76] ellipsis_0.3.2 ipred_0.9-13